## required packages
library(tidyverse)
library(xtable)


## read in the data,  this path will need to be modified for your system
## if opened as an R project with files in the same folder, this should work
working_dir <- getwd()
conf_data <- read_csv(str_c(working_dir, 
                            "/conf_meta_final.csv"))



#############################################################################################

#Table 1: Numbers of Conferences/Workshops by Subfields (p. 9)

#########################
## Summary percentages
##################


## simple count
# how to count the number of conferences by subfields

subfield_counts <- conf_data %>%
  group_by(subfield) %>%
  summarize(counts = n())


conf_subfield_counts <- xtable(subfield_counts, 
                               caption = "Numbers of Conferences/Workshops by Subfields",
                               label = "tab:conferences_subfield")

print(conf_subfield_counts, include.rownames = F)
## to save the .tex table
#print.xtable(conf_subfield_counts, 
#             file=str_c("conf_subfield_counts.tex"),
#             include.rownames = F)



##############################################################################################

## Descriptive statistics in Section 5 Results

#Our first result is the prevalence of codes. We found that 34 out of 177 surveyed conferences
#(19%) had codes. Limiting the events to strictly political science conferences and workshops,
#there were 25 out of 146 conferences with codes (17%) (p.12). 

## how many codes?
length(unique(conf_data$number)) # 177
length(conf_data$number[conf_data$coc_binary_variable == 1]) # 34

## percent with code
34/177 #19.2%

#excluding adjacent conferences
conf_data2 <- conf_data[conf_data$subfield!="Adjacent fields",]
# how many codes
length(unique(conf_data2$number)) #146
length(conf_data2$number[conf_data2$coc_binary_variable == 1]) #25

## percent with code, no adjacent fields
25/146 #17%

##############################################################################################

#The distribution of codes by subfield is shown in Appendix C, Figure 3
# Prevalence of Codes by Subfield, Ordered by Percentage with Codes

#histogram of conference by subfields
conf_data$subfield <- as.factor(conf_data$subfield)

conf_data_ggplot <- conf_data

conf_data_ggplot <- transform(conf_data_ggplot, subfield = reorder(subfield, coc_binary_variable==1))


subfield_hist <- conf_data_ggplot  %>%
  ggplot(aes(x = conf_data_ggplot$subfield, fill=as.factor(coc_binary_variable)))+
  geom_bar(width=0.7, position="dodge") + 
  scale_fill_grey() + theme_classic() +
  labs(
    x = "Subfield of Conferences",
    y = "Conference Counts",
    title = "",
    fill =  "Presence of Code"
  ) +
  theme(text = element_text(size = 16), legend.position="top") +
  #axis.text.x = element_text(angle=45)) 
  coord_flip()

subfield_hist

##############################################################################################


## Table 5: Top 5 Reused Codes in Appendix C

### read in the already calculated similarity data
## again, point to where you have the data saved
simil_scores <- read_csv(str_c(working_dir, 
                            "/code_similarity_scores_rep.csv"))

#### For each code, what is the highest average alignment score?

average_align <- simil_scores %>% 
  group_by(a) %>% 
  summarize(mean_align = mean(align)) %>% 
  arrange(desc(mean_align))

top5 <- average_align %>% 
  slice(1:5)

######## Which conferences are these?
### based on a merge with protected data,
### extracted the following conf names
conf_names <- c("ACM Web Science Conference", "Latin American Studies Association", 
                "American Political Science Association", "African Studies Association", 
                "Midwest Political Science Association")

df_top5 <- cbind(conf_names,top5[,2])
## top 5 scores table for appendix

colnames(df_top5) <- c("Conference Name", "Mean Pairwise Alignment Score")

## to tex
tex_table_reuse <- xtable(df_top5, 
                          caption = "Top 5 Reused Codes",
                          label = "tab:reused_5")

print(tex_table_reuse, include.rownames = F)


##############################################################################################

#For analysis, we binned the conference ages into three categories: older than 50 years, 25-50 years, and
#younger than 25 years (see Appendix C, Figure 4 for distribution) 

# Figure 4: Prevalence of Codes by Number of Years of Conference

#create categories  
#occurrences
conf_data <- conf_data %>% mutate(
  how_many_2 = case_when(
    how_many > 1 & how_many <= 25 ~ "1-25",
    how_many > 25 & how_many <= 50 ~ "25-50",
    how_many == ">50" ~ ">50"
  )
)

conf_data$how_many_2 <- as.factor(conf_data$how_many_2)
table(conf_data$how_many_2)


#histogram of conference ages
conf_data$how_many_3 <- conf_data$how_many_2
conf_data$how_many_3 <- factor(conf_data$how_many_3, levels=c("1-25", "25-50", ">50"))


history_hist <- conf_data %>% 
  drop_na(how_many_3) %>%
  ggplot(aes(x = how_many_3, fill=as.factor(coc_binary_variable)))+
  geom_bar(width=0.5) + 
  scale_fill_grey() + theme_classic() +
  labs(
    x = "Numbers of Years in Existence of Conferences",
    y = "Conference Counts",
    title = "",
    fill =  "Presence of Code"
  ) +
  theme(text = element_text(size = 16), legend.position="top")

history_hist

#26 conferences have a history of over 50 years long. Of those conferences, on average 65% are predicted to draft a code.
#In contrast, the probabilities drop significantly for conferences with a shorter history. Only
#about 14% of conferences fewer than 25 years old are predicted to have a code


##############################################################################################

# In terms of mode, 23% of the offline conferences have a code versus only 10% of online or hybrid conferences (p. 13).


# some descriptive
conf_data$virtual <- as.factor(conf_data$virtual)
conf_data$virtual2 <- ifelse(conf_data$virtual=="No", "No", "Yes")


#125 offline
sum(conf_data$virtual=="No",na.rm=T)
table(conf_data$coc_binary_variable[conf_data$virtual2=="No"])
#29 has code
29/125 #0.23

#online or hybrid (49)
table(conf_data$coc_binary_variable[conf_data$virtual2=="Yes"])
#5 has code
5/49  #0.1

##############################################################################################

## Table 6 in Appendix D 
## Table 6: OLS Regression: Relationships between the presence of a code of conduct and factors related to conference resources and leadership


#relationship between conference years and the possibility of having coc
f1 <- lm(coc_binary_variable ~ how_many_2-1, data = conf_data)
#summary(f1)
# conferences >50 are more likely to have code of conducts

f2 <- lm(coc_binary_variable ~ permanent_staff, data = conf_data)
#summary(f2)

f3 <- lm(coc_binary_variable ~ length_conf, data = conf_data)
summary(f3)
mean(conf_data$length_conf, na.rm=T)

f4 <- lm(coc_binary_variable ~ virtual2 - 1, data = conf_data)
summary(f4)
#added offline or online
#appendix

#23 versus 10 percent

table(conf_data$leadership_gender_prop!=0)
sum(is.na(conf_data$leadership_gender_prop=="NA"))
#42

#122 conferences have at least one women in their leadership
122/(177-42)

## Regarding gender in leadership, recall that conference leadership is on average 50% female. (p.13) 

mean(conf_data$leadership_gender_prop,na.rm=T)
#0.5
median(conf_data$leadership_gender_prop,na.rm=T)
#0.5

f5 <- lm(coc_binary_variable ~ leadership_gender_prop, data = conf_data)
summary(f5)

conf_data$commit_binary <- ifelse(conf_data$`Relevant committees`==0, 0, 1)

f6 <- lm(coc_binary_variable ~ commit_binary, data = conf_data)
summary(f6)

stargazer::stargazer(f1, f2, f3, f4, f5, f6,
                     star.cutoffs = c(0.05, 0.01, 0.001))

##############################################################################################
## Table 7 in Appendix D
## Table 7: Logistics Regression: Relationships between the presence of a code of conduct and
## factors related to conference resources and leadership

f1b <- glm(coc_binary_variable ~ how_many_2 - 1, data = conf_data, family = "binomial")
summary(f1b)
exp(coef(f1b))
# conferences >50 are more likely to have code of conducts

f2b <- glm(coc_binary_variable ~ permanent_staff, data = conf_data, family = "binomial")
summary(f2b)

f3b <- glm(coc_binary_variable ~ length_conf, data = conf_data, family = "binomial")
summary(f3b)

f4b <- glm(coc_binary_variable ~ virtual2 - 1, data = conf_data, family = "binomial")
summary(f4b)
#exp(coef(f4b))

f5b <- glm(coc_binary_variable ~ leadership_gender_prop, data = conf_data, family = "binomial")
summary(f5b)

f6b <- glm(coc_binary_variable ~ commit_binary, data = conf_data, family = "binomial")
summary(f6b)

stargazer::stargazer(f1b, f2b, f3b, f4b, f5b, f6b,
                     star.cutoffs = c(0.05, 0.01, 0.001))

################################################################################################


## Table 3: Summary of Code Findings

main_summaries <- conf_data %>% 
  summarize(Count = round(n(), 0),
            `Mention Identity` = mean(mention_id, na.rm=T),
            `Mention Gender` = mean(mention_gender, na.rm=T),
            `Mention Race` = mean(mention_race, na.rm=T),
            `Mention Sexual Harassment` = mean(mention_sex_har, na.rm=T),
            `Define Discrimination` = mean(disc_def_bin, na.rm=T),
            `Define Harassment` = mean(harass_def_bin, na.rm=T),
            `Define Sexual Harassment` = mean(sex_har_def_bin, na.rm=T),
            `Mention Prohibited Behaviors` = mean(prohib_bin, na.rm=T),
            `Mention Bystander Accountability` = mean(mention_bystander, na.rm=T),
            `Ombuds` = mean(ombuds, na.rm=T),
            `General Committee` = mean(committee_gen, na.rm=T),
            `Sexual Harassment Committee` = mean(committee_sex, na.rm=T),
            `General Reporting Mechanism` = mean(gen_reporting, na.rm=T),
            `Sexual Harassment Reporting Mechanism` = mean(sex_har_reporting, na.rm=T),
            `Explains Who Receives Reports` = mean(report_who, na.rm=T),
            `Reports Received Externally` = mean(report_unbiased, na.rm=T),
            `Adjudication Process` = mean(adjud_process, na.rm=T),
            `Adjudication Confidentiality` = mean(adjud_confid, na.rm=T),
            `Law Enforcement` = mean(law_police, na.rm=T),
            `Appeal` = mean(rights_accused, na.rm=T),
            `Consequences` = mean(conseq_bin, na.rm=T),
            `Clear Definition Composite` = mean(clear_def, na.rm=T))

### clear definition composite
clear_def_stat <- main_summaries %>% 
  select(`Clear Definition Composite`) %>% 
  pull

clear_def_stat

### Reshape to a pretty table
pretty_table <- main_summaries %>% 
  select(-Count, -`Clear Definition Composite`) %>% 
  pivot_longer(`Mention Identity`:`Consequences`, 
               names_to = "Variable", 
               values_to = "Percent of Codes") %>% 
  mutate(`Percent of Codes` = as.character(100*round(`Percent of Codes`, 2))) %>% 
  add_row(Variable = "Clear Definitions", `Percent of Codes` = "", .before = 1) %>% 
  add_row(Variable = "Clear Reporting", `Percent of Codes` = "", .before = 10) %>% 
  add_row(Variable = "Clear Adjudicating", `Percent of Codes` = "", .before = 19)

###
tex_table <- xtable(pretty_table, caption = "Summary of Code Findings",
                    label = "tab:summary_stats")

print(tex_table, include.rownames = F)
## to save the tex table
#print.xtable(tex_table, file="summary_stats.tex",
#             include.rownames = F)

##############################################################################################

#  Figure 5 in Appendix C for a wordcloud of listed identities 

library(ggwordcloud)

## identities 
ident_text <- conf_data %>% 
  select(number, id_list) %>% 
  filter(!is.na(id_list)) %>% 
  separate_longer_delim(id_list, delim = ",") %>% 
  separate_longer_delim(id_list, delim = regex("\\bor\\b")) %>% 
  separate_longer_delim(id_list, delim = regex("\\band\\b")) %>%
  separate_longer_delim(id_list, delim = regex("/")) %>%
  separate_longer_delim(id_list, delim = regex(";")) %>%
  ## drop the term "perceived"
  mutate(id_terms = str_replace(id_list, "perceived", "")) %>% 
  ### get rid of extra whitespace
  mutate(id_terms = str_squish(id_terms),
         ## recode floating "expression" to "gender expression"
         id_terms = case_when(id_terms == "expression"~ "gender expression",
                              id_terms == "academic"~ "academic status",
                              id_terms == "sex (including pregnancy)"~ "sex",
                              id_terms == "sexism"~ "sex",
                              id_terms == "sexual"~ "sexual orientation",
                              id_terms == "homophobia"~ "sexual orientation",
                              id_terms == "socio-economic background"~ "socioeconomic status",
                              str_detect(id_terms, "characteristic protected") ~ "legally protected characteristic",
                              id_terms == "health conditions" ~ "health",
                              id_terms == "health condition" ~ "health",
                              str_detect(id_terms, "any personal characteristic") ~ "personal characteristc",
                              id_terms == "domestic" ~ "domestic status",
                              id_terms == "fatphobia" ~ "body size",
                              id_terms == "immigrant status" ~ "immigration status",
                              id_terms == "marital" ~ "marital status",
                              id_terms == "racism" ~ "race",
                              id_terms == "transphobia" ~ "gender identity",
                              id_terms == "xenophobia" ~ "immigration status",
                              id_terms == "religion (creed)" ~ "religion",
                              id_terms == "religious affiliation" ~ "religion",
                              TRUE ~ id_terms)) %>%
  ## drop blank terms and single phrase "actual"
  filter(!id_terms == "", 
         !id_terms == "actual")


## summaries of the identities
ident_summaries <- ident_text %>% 
  group_by(id_terms) %>% 
  summarize(count = n()) %>% 
  arrange(-count)

## word cloud
ident_summaries %>%
  ggplot() + 
  geom_text_wordcloud_area(aes(label = id_terms, size = count)) +
  scale_size_area(max_size = 15)

##############################################################################################

## Table 4 in Appendix C
## Table 4 shows the number of codes that contained 
## each of the methods for reporting violations.

reporting_num <- conf_data %>% 
  select(number, report_how) %>% 
  filter(!is.na(report_how))

## Counts of the type of reporting
reporting_texts <- conf_data %>% 
  select(number, report_how) %>% 
  filter(!is.na(report_how)) %>% 
  separate_longer_delim(report_how, delim = ",") %>% 
  ### get rid of extra whitespace
  mutate(report_how = str_squish(report_how)) %>% 
  group_by(report_how) %>% 
  summarise(count = n(),
  ) %>% 
  arrange(-count) %>% 
  rename(`Method of Reporting` = report_how,
         Count = count)

## to text
tex_table_reporting <- xtable(reporting_texts, 
                              caption = "Summary of Types of Reporting",
                              label = "tab:reporting_counts")

print(tex_table_reporting, include.rownames = F)

##############################################################################################

# Figure 1: Distribution of Combined Content Scores, Dashed Line is Mean (p. 15)
## scorecard analysis
## subtract those who have codes
conf_data_score <- conf_data[complete.cases(conf_data$coder),]


# Clear definition (#6)
## Drop the “mention gender” and “mention race”
conf_data_score <- conf_data_score %>%
  rowwise() %>%
  mutate(clear_def_score = sum(c_across(c("mention_gender",
                                          "prohib_bin", "disc_def_bin", "harass_def_bin", 
                                          "mention_sex_har", "sex_har_def_bin")), na.rm=T))
mean(conf_data_score$clear_def_score) #3.47
sum(conf_data_score$clear_def_score>2) 
24/34 #0.71

# Clear reporting (#7)
## Drop Sexual Harassment Committee
conf_data_score$report_how_bin <- ifelse(conf_data_score$report_how=="NA", 0, 1)

conf_data_score <- conf_data_score %>%
  rowwise() %>%
  mutate(clear_rep_score = sum(c_across(c("gen_reporting", "sex_har_reporting", "ombuds",
                                          "report_who", "committee_gen", "committee_sex", 
                                          "report_unbiased", "report_how_bin")), na.rm=T))

# Clear adjudicating (#5)

conf_data_score <- conf_data_score %>%
  rowwise() %>%
  mutate(clear_adj_score = sum(c_across(c("adjud_process", "adjud_confid", "law_police",
                                          "rights_accused", "conseq_bin")), na.rm=T))

# Total score out of 18

conf_data_score <- conf_data_score %>%
  rowwise() %>%
  mutate(clear_total_score = sum(c_across(c("clear_def_score", "clear_rep_score", "clear_adj_score")), na.rm=T))

##
sum(conf_data_score$clear_total_score>9.5) #23
length(conf_data_score$clear_total_score) #34
mean(conf_data_score$clear_total_score) #9.5
###########################################################################
###########################################################################

(34-23)/34

sum(conf_data_score$clear_total_score==14) #6
sum(conf_data_score$clear_total_score==13) #6

## ggplot

total_score_fig <- conf_data_score %>%
  ggplot(aes(x=clear_total_score)) +
  geom_histogram(binwidth=1, color = "#000000", fill = "#0099F8", alpha=0.9) +
  labs(x="Total Score (out of 18)",
       y="Counts") +
  scale_x_continuous(limits = c(0, 18)) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "bold"),
    plot.caption = element_text(face = "italic"),
    text = element_text(size = 16)
  ) + 
  geom_vline(xintercept=mean(conf_data_score$clear_total_score), linetype="dashed", 
             color = "red", size=2) 

total_score_fig

### writing out
#ggsave("total_score_dis.jpg", width = 9, height = 8, dpi=800)
#ggsave("total_score_dis.tiff", width = 9, height = 8)

###########################################################################

## scores on the three dimensions for Figure 2
def_score_fig <- conf_data_score %>%
  ggplot(aes(x=clear_def_score)) +
  geom_histogram(binwidth=0.5, color = "#000000", fill = "#0099F8") +
  labs(x="Clear definition score (out of 6)",
       y="Counts") +
  scale_y_continuous(breaks=c(0,3,6,9,12,15)) +
  scale_x_continuous(breaks=seq(0,7)) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "bold"),
    plot.caption = element_text(face = "italic"),
    text = element_text(size = 16)
  )
def_score_fig

sum(conf_data_score$clear_def_score >= 3)/length(conf_data_score$clear_total_score)


###########################################################################

rep_score_fig <- conf_data_score %>%
  ggplot(aes(x=clear_rep_score)) +
  geom_histogram(binwidth=0.5, color = "#000000", fill = "#0099F8") +
  labs(x="Clear reporting score (out of 7)",
       y="Counts") +
  scale_y_continuous(breaks=c(0,3,6,9,12,15)) +
  scale_x_continuous(breaks=seq(0,7), limits=c(-0.5,7)) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "bold"),
    plot.caption = element_text(face = "italic"),
    text = element_text(size = 16)
  )
rep_score_fig

###########################################################################

adj_score_fig <- conf_data_score %>%
  ggplot(aes(x=clear_adj_score)) +
  geom_histogram(binwidth=0.5, color = "#000000", fill = "#0099F8") +
  labs(x="Clear enforcement/adjudicating score (out of 5)",
       y="Counts") +
  scale_y_continuous(breaks=c(0,3,6,9,12,15)) +
  scale_x_continuous(breaks=seq(0,7)) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "bold"),
    plot.caption = element_text(face = "italic"),
    text = element_text(size = 16)
  )
adj_score_fig

sum(conf_data_score$clear_adj_score == 5)


###########
#install.packages("ggpubr")
library(ggpubr)

## Figure 2
plot_combined <- ggarrange(def_score_fig, rep_score_fig,
                           adj_score_fig, ncol=1, nrow=3)
plot_combined
#ggsave("plot_combined.tiff", width = 9, height = 8)
#######################################################
